#!/usr/bin/env ruby
#
# Copyright © 2015 Siarhei Siamashka <siarhei.siamashka@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice (including the next
# paragraph) shall be included in all copies or substantial portions of the
# Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

###############################################################################
# Assuming Gaussian distribution of the DRAM clock speeds where the           #
# lima-memtester test starts to detect reliability problems due to variances  #
# between different board samples (but the same board model!), predict how    #
# many boards of the same type are expected to pass/fail the test at each     #
# DRAM clock speed.                                                           #
#                                                                             #
# If a "good" DRAM clock frequency (passes the test) is 672 MHz and a "bad"   #
# clock frequency is 696 MHz, then we are assuming that the cross over point  #
# is somewhere in the middle. So we want to pass the "(672 + 696) / 2" value  #
# into this script. In order to make this easier, there is the <freq_delta>   #
# command line argument, which can be used to add a 12 MHz shift to the       #
# "good" value or subtract 12 MHz from the "bad" value. It is a convenience   #
# feature to avoid maual adjustment of the DRAM clock frequency lists         #
# used as input for this script.                                              #
###############################################################################

require 'tempfile'

if ARGV.size < 1
  printf("Usage: #{$PROGRAM_NAME} <freq_delta> <png_file>\n")
  printf("Where:\n")
  printf("    freq_delta  - adjustment for the frequency.\n")
  printf("    png_file    - resulting PNG file with the plot.\n")
  printf("\n")
  printf("This script reads the list of space separated DRAM clock\n")
  printf("frequency values from STDIN, adds <freq_delta> to them and\n")
  printf("generates a gnuplot chart. Several groups of DRAM clock\n")
  printf("frequencies can be also comma separated and shown on the\n")
  printf("same chart using different colors (blue, green, red, pink).\n")
  printf("\n")
  printf("Example:\n")
  printf("    echo '672 696, 648 648 672' | lima-memtester-genchart 12 out.png\n")
  printf("\n")
  printf("In this example the cumulative distribution function for the data\n")
  printf("set [684 708] will be shown as a blue line and the cumulative\n")
  printf("distribution function for the set [660 660 684] will be shown\n")
  printf("as a green line in the 'out.png' output file.\n")
  exit(1)
end

freq_delta = ARGV[0].to_f
$binning_interval = (freq_delta.abs * 2).to_i

###############################################################################
# Parse html page from mediawiki                                              #
###############################################################################

def html_parse(data)
  out = []
  data.scan(/(<table\s+class=\"wikitable\">)(.*?)(<\/table>)/m) do |_, tbl, _|
    result1 = []
    tbl.scan(/(<tr>)(.*?)(<\/tr>)/m) do |_, row, _|
      tmp = []
      row.scan(/(<th>)(.*?)(<\/th>)/m) { |_, x, _| tmp.push(x.strip) }
      row.scan(/(<td>)(.*?)(<\/td>)/m) { |_, x, _| tmp.push(x.strip) }
      result1.push(tmp)
    end

    if result1.size > 1 && ((result1[0][2] =~ /lima\-memtester passes/) &&
                            (result1[0][3] =~ /lima\-memtester fails/))
      good_table = true

      # Select only rows with exactly 5 columns
      result2 = result1.select { |x| x.size == 5 }

      # The first sanity check
      good_table = false if result2.size != result1.size

      # Remove the table header
      result2.shift if good_table

      # Extract the DRAM clock frequencies (the cross over point)
      result3 = result2.map do |x|
        lo = x[2].gsub(/<[\/]?b>/, "").to_i
        hi = x[3].gsub(/<[\/]?b>/, "").to_i
        $binning_interval = (hi - lo) if $binning_interval == 0
        if lo == 0 || hi == 0 || lo >= hi || (hi - lo) != $binning_interval
          printf("Problems parsing row %s\n", x)
          good_table = false
        end
        (lo + hi).to_f / 2
      end
      out.push(result3) if good_table
    end
  end
  return out
end

###############################################################################
# Chebyshev's inequality for lower semivariance                               #
###############################################################################

# https://en.wikipedia.org/wiki/Chebyshev%27s_inequality#Semivariances
def chebyshev_lower_semivariance_p(samples, val)
  sample_mean   = samples.reduce(:+) / samples.size.to_f
  lower_semivariance = samples.select {|x| x < sample_mean }.reduce(0) {|sum, x|
                          sum + (x - sample_mean) ** 2} / (samples.size - 1)
  a = (sample_mean - val) / Math.sqrt(lower_semivariance)
  if val < sample_mean
    [1.0 / (a * a), 1.0].min
  else
    1.0
  end
end

###############################################################################
# Multinomial test (by calling an external R programming language interpreter #
# with XNomial library). If there are two sets of DRAM clock frequencies      #
# provided then use the first set for training and the second set for testing #
###############################################################################

def multinomial_test(multinomial_data)
  if multinomial_data.size >= 1
    cutoff_prob = 0.0001
    train_idx = 0
    test_idx  = (multinomial_data.size >= 2 ? 1 : 0)
    tmp = {}
    multinomial_data[train_idx][:prob].each do |freq, prob|
      tmp[freq] = [prob, 0] if prob >= cutoff_prob
    end
    multinomial_data[test_idx][:observed].each do |freq, observed|
      prob = (multinomial_data[train_idx][:prob][freq] || 0.0)
      next if observed == 0 && prob < cutoff_prob
      tmp[freq] = [prob, observed]
    end
    tmp = tmp.sort

    # https://cran.r-project.org/web/packages/XNomial/vignettes/XNomial.html
    r_input  = sprintf("library(XNomial)\n\n")
    r_input += sprintf("prob <- c(%s)\n", tmp.map {|x| x[1][0]}.join(", "))
    r_input += sprintf("observed <- c(%s)\n", tmp.map {|x| x[1][1]}.join(", "))
    r_input += sprintf("out <- xmulti(observed, prob, detail=3)\n")
    r_output = `echo '#{r_input}' | R --vanilla 2>/dev/null`

    if r_output
      x_output = ""
      r_output.each_line do |l|
        x_output += "    " + l.strip + "\n" if l =~ /^P value/
      end
      return x_output if x_output != ""
    end
  end
end

###############################################################################
# Cumulative distribution function                                            #
###############################################################################

# https://en.wikipedia.org/wiki/Normal_distribution#Cumulative_distribution_function
def cumulative_distribution_function(x, sample_mean, sample_stddev)
  return 0.5 * (1 + Math.erf((x - sample_mean) / (sample_stddev * Math.sqrt(2))))
end

###############################################################################
# Analyze data, calculate probabilities                                       #
###############################################################################

if ARGV[0] =~ /^http/
  require 'open-uri'
  html_data = open(ARGV[0], &:read)
  data_sets = html_parse(html_data)
else
  data_sets = STDIN.read.split(",").map {|s| s.split.map {|x| x.to_f + freq_delta }}
end

printf("Frequency shift   : %d MHz\n", freq_delta.to_i)
printf("Binning interval  : %d MHz\n", $binning_interval)

multinomial_data = []

data_sets.each do |samples|
  printf("Processing data   : %s\n", samples.map {|x| x.to_i })
  abort "As least two DRAM clock speed samples are needed" if samples.size < 2

  sample_mean   = samples.reduce(:+) / samples.size.to_f
  sample_stddev = Math.sqrt(samples.reduce(0) {|sum, x| sum + (x - sample_mean) ** 2} /
                            (samples.size - 1))

  printf("Sample mean       : %f\n", sample_mean)
  printf("Sample stddev     : %f\n", sample_stddev)

  # Analyze the frequencies range plus/minus six standard deviations from the mean
  mid_freq = sample_mean.to_i / $binning_interval * $binning_interval
  min_freq = mid_freq - (sample_stddev * 6).to_i / $binning_interval * $binning_interval
  max_freq = mid_freq + (sample_stddev * 6).to_i / $binning_interval * $binning_interval

  tmp = {}
  tmp[:mean] = sample_mean
  tmp[:stddev] = sample_stddev
  tmp[:observed] = {}
  tmp[:prob] = {}
  (min_freq..max_freq).step($binning_interval) do |freq|
    cdf2 = cumulative_distribution_function(freq + $binning_interval,
                                            sample_mean, sample_stddev)
    cdf1 = cumulative_distribution_function(freq, sample_mean, sample_stddev)
    tmp[:prob][freq] = cdf2 - cdf1
    tmp[:observed][freq] = samples.reduce(0) do |sum, x|
      sum += ((x >= freq && x < freq + $binning_interval) ? 1 : 0)
    end
  end
  multinomial_data.push(tmp)

  multinomial_results = multinomial_test([multinomial_data[-1]])

  printf("\n")
  printf("{| class=\"wikitable\"\n")
  printf("! rowspan=2 | DRAM clock speed\n")
  printf("! colspan=2 | Percentage of boards failing the lima-memtester test\n")
  printf("! rowspan=2 | Theoretical pessimistic upper bound of the failure " +
         "percentage using Chebyshev's inequality for lower semivariance  <ref name=\"chebyshev\">\n")
  printf("If nothing is known about the distribution of samples, then at least\n")
  printf("[https://en.wikipedia.org/wiki/Chebyshev's_inequality Chebyshev's inequality]\n")
  printf("can be used to get a rough idea about the probabilities of encountering reliability\n")
  printf("problems at different DRAM clock speeds. But this method is <u>very conservative</u>\n")
  printf("and substantially overestimates probabilities (being too generic has its price).\n")
  printf("</ref>\n")
  printf("! rowspan=2 | Histogram\n")
  printf("|-\n")
  printf("! Experimental results\n")
  printf("! Theoretical prediction (assuming Gaussian distribution) <ref name=\"gaussian\">\n")
  printf("We can assume that the [https://en.wikipedia.org/wiki/Normal_distribution Gaussian distribution]\n")
  printf("is a good approximation for our experimental data, calculate theoretical probabilities and do an\n")
  printf("[http://www.biostathandbook.com/exactgof.html exact test of goodness-of-fit]\n")
  printf("to see if the experimental data does not contradict with the theory.\n")
  printf("There is a nice [https://cran.r-project.org/web/packages/XNomial/vignettes/XNomial.html XNomial]\n")
  printf("library for R, which can do the job")
  if multinomial_results
    printf(":<pre>\n#{multinomial_results}</pre>\n")
    printf("If the [https://en.wikipedia.org/wiki/P-value p-values] listed above happen to be too low\n")
    printf("(less than 0.05) and reject our [https://en.wikipedia.org/wiki/Null_hypothesis null hypothesis]\n")
    printf("about having the Gaussian distribution, then Chebyshev's inequality estimates still can be used.\n")
  else
    printf(" (it was not installed on the system where this report had been generated).\n")
  end
  printf("</ref>\n")

  freq = max_freq
  while true
    p = cumulative_distribution_function(freq, sample_mean, sample_stddev)
    pc = chebyshev_lower_semivariance_p(samples, freq)
    break if (p < 0.0001 && pc < 0.01) || freq <= min_freq
    freq -= $binning_interval
  end

  prev_observed = 0
  while freq <= max_freq
    p = cumulative_distribution_function(freq, sample_mean, sample_stddev)
    pc = chebyshev_lower_semivariance_p(samples, freq)
    observed = samples.reduce(0) {|sum, x| x <= freq ? sum + 1 : sum }
    histogram = "*" * (observed - prev_observed)
    prev_observed = observed
    color = ""
    color = "style=\"color: orange;\"" if p >= 0.01 || pc >= 0.05
    color = "style=\"color: red;\"" if observed > 0
    printf("|- %s\n| %.0f MHz || %.2f %% (%d/%d) || %.2f %% || %.2f %% || #{histogram}\n",
           color,
           freq, observed * 100.0 / samples.size, observed, samples.size,
           p * 100, pc * 100)
    break if p >= 0.99
    freq += $binning_interval
  end
  printf("|}\n")
  printf("<references/>\n")

end
